package com.cmge.ad.spider.pic.gaoxiao;

import java.util.ArrayList;
import java.util.List;

import org.springframework.util.StringUtils;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Selectable;

import com.cmge.ad.model.Picture;
import com.cmge.ad.spider.pipeline.MysqlAlbumPicturePipeline;

/**
 * @desc	Crawler for the funny-picture section of Neihancun (nhcun.com). Already crawled.
 *
 * 			http://www.nhcun.com/show/list_2.html
 * 
 * @author	ljt
 * @time	2014-12-30 7:51:34 PM
 */
public class NeiHanCunGaoXiaoImagCrawl implements PageProcessor {

    /** Regex fragment that identifies a list page URL, e.g. {@code list_2}. */
    public static final String URL_LIST = "list_\\w+";

    /** Scheme and host prepended to site-relative image paths. */
    private static final String BASE_URL = "http://www.nhcun.com";

    /** Number of the list page the crawl is seeded with (see {@link #main(String[])}). */
    private static final int FIRST_PAGE = 1;

    /** Highest list page number that gets requested. */
    private static final int MAX_PAGE = 167;

    private final Site site = Site.me().setRetryTimes(3).setSleepTime(100);

    /**
     * Handles one downloaded page: for list pages, collects the pictures into
     * the page's result fields and, on the first list page, queues all the
     * remaining list pages. Pages whose URL does not match {@link #URL_LIST}
     * are ignored.
     *
     * @param page the downloaded page handed over by the spider
     */
    @Override
    public void process(Page page) {
        if (!page.getUrl().regex(URL_LIST).match()) {
            return;
        }
        extractPictures(page);
        scheduleRemainingPages(page);
    }

    /**
     * Builds a {@link Picture} for every usable image on the list page and
     * stores the result under the {@code picList} and {@code type} fields.
     * Nothing is stored when the page contains no image nodes at all.
     */
    private void extractPictures(Page page) {
        List<Selectable> images = page.getHtml()
                .xpath("//div[@class='leftcont']//dl//div[@class='contimg']//img").nodes();
        if (images == null || images.isEmpty()) {
            return;
        }
        List<Selectable> titles = page.getHtml()
                .xpath("//div[@class='leftcont']//dl//h3/a/text()").nodes();

        List<Picture> picList = new ArrayList<Picture>();
        // NOTE(review): titles are paired with accepted images by position, and
        // the index only advances for accepted images (original behaviour).
        // Confirm against the page structure that skipped images never own a title.
        int titleIndex = 0;
        for (Selectable image : images) {
            // get() yields null when the attribute is absent, so it must not be
            // dereferenced before the emptiness check below.
            String style = image.xpath("/img/@style").get();
            String src = image.xpath("/img/@src").get();
            if (StringUtils.isEmpty(src) || StringUtils.isEmpty(style)) {
                continue;
            }

            Picture picture = new Picture();
            // Fewer titles than images must not abort the whole page; the
            // description is simply left unset in that case.
            if (titles != null && titleIndex < titles.size()) {
                picture.setDesc(titles.get(titleIndex).get());
            }
            String imageUrl = toAbsoluteUrl(src);
            picture.setMaxImageUrl(imageUrl);
            picture.setMinImageUrl(imageUrl);
            picture.setSource("neihancun_gaoxiaotu");
            picList.add(picture);
            titleIndex++;
        }
        page.putField("picList", picList);
        // Type code read by the downstream pipeline; its meaning is defined there.
        page.putField("type", 3);
    }

    /**
     * Queues list pages {@code FIRST_PAGE + 1 .. MAX_PAGE} when the current
     * page is the first list page, then prints the current page number.
     * When the page number cannot be read from the URL, a warning is printed,
     * nothing is queued and the page number is reported as {@code FIRST_PAGE}.
     */
    private void scheduleRemainingPages(Page page) {
        String url = page.getUrl().get();
        int current = FIRST_PAGE;

        Integer parsed = parsePageNumber(url);
        if (parsed == null) {
            System.err.println("Cannot parse list page number from URL: " + url);
        } else {
            current = parsed.intValue();
            if (current == FIRST_PAGE) {
                for (int pageNo = FIRST_PAGE + 1; pageNo <= MAX_PAGE; pageNo++) {
                    page.addTargetRequest(BASE_URL + "/show/list_" + pageNo + ".html");
                }
            }
        }

        System.out.println("current is "+current);
    }

    /**
     * Reads the number between the last {@code '_'} and the last
     * {@code ".html"} of a URL such as {@code .../list_12.html}.
     *
     * @param url the URL to inspect, may be {@code null}
     * @return the parsed number, or {@code null} when the URL does not have
     *         that shape or the enclosed text is not an integer
     */
    private static Integer parsePageNumber(String url) {
        if (url == null) {
            return null;
        }
        int start = url.lastIndexOf('_') + 1;
        int end = url.lastIndexOf(".html");
        if (start <= 0 || end < start) {
            return null;
        }
        try {
            return Integer.valueOf(Integer.parseInt(url.substring(start, end)));
        } catch (NumberFormatException e) {
            return null;
        }
    }

    /**
     * Turns an image {@code src} into a full URL: values that already carry an
     * http/https scheme are returned unchanged, anything else is treated as
     * site-relative and prefixed with {@link #BASE_URL}.
     */
    private static String toAbsoluteUrl(String src) {
        if (src.startsWith("http://") || src.startsWith("https://")) {
            return src;
        }
        return BASE_URL + src;
    }

    /**
     * @return the site settings used by this processor (3 retries, 100 ms sleep)
     */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Starts a single-threaded spider on the first list page and hands the
     * results to {@link MysqlAlbumPicturePipeline}.
     *
     * @param args unused
     * @throws Exception declared for compatibility with existing callers
     */
    public static void main(String[] args) throws Exception {
        Spider qsSpider = Spider.create(new NeiHanCunGaoXiaoImagCrawl())
                .addUrl(BASE_URL + "/show/list_" + FIRST_PAGE + ".html")
                .addPipeline(new MysqlAlbumPicturePipeline())
                .thread(1);
        qsSpider.start();
    }

}
